import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sys
import importlib
sys.path.insert(0, '../')
import general_utils as gen_ut
sys.path.insert(0, '../3_Hashtag_study')
import hashtag_util as ut_ht
first_case = "2020-11-17"  # 2020-11-17 --> the first ascertained COVID-19 infection is recorded

def df_preparation(cols, date=first_case, filename='../tweets.csv'):
    """Load tweets whose author account was created strictly after *date*.

    Parameters
    ----------
    cols : list of str
        CSV columns to load (``user_created_at`` is always added).
    date : str or datetime-like, optional
        Cutoff for the account-creation timestamp. A tz-naive value is
        localized to UTC so it can be compared with the tz-aware column.
    filename : str or file-like, optional
        Path (or buffer) handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The "suspect" rows (young accounts), reindexed 0..n-1.
    """
    columns = list(set(cols + ['user_created_at']))
    df = pd.read_csv(filename, low_memory=False, usecols=columns)
    # Twitter timestamp format, e.g. "Sun Nov 22 18:44:29 +0000 2020" (tz-aware).
    df['user_created_at'] = pd.to_datetime(df['user_created_at'], format="%a %b %d %X %z %Y")
    # The parsed column is tz-aware; comparing it against a tz-naive string
    # raises "Cannot compare tz-naive and tz-aware" in modern pandas, so
    # normalize the cutoff explicitly.
    cutoff = pd.Timestamp(date)
    if cutoff.tzinfo is None:
        cutoff = cutoff.tz_localize('UTC')
    # reset_index(drop=True) reproduces the old concat(..., ignore_index=True)
    # reindexing without building a throwaway empty frame.
    return df.loc[df['user_created_at'] > cutoff, :].reset_index(drop=True)
# URL study: reload the suspect tweets with their url payload and parse
# the tweet creation timestamps (Twitter's timestamp layout).
df = df_preparation(['created_at', 'urls'])
twitter_fmt = "%a %b %d %X %z %Y"
df['created_at'] = pd.to_datetime(df['created_at'], format=twitter_fmt)
df
| created_at | user_created_at | urls | |
|---|---|---|---|
| 0 | 2020-11-23 18:59:45+00:00 | 2020-11-22 18:44:29+00:00 | [] |
| 1 | 2020-11-23 18:43:52+00:00 | 2020-11-17 08:58:10+00:00 | [] |
| 2 | 2020-11-23 18:01:32+00:00 | 2020-11-21 19:29:54+00:00 | [{'url': 'https://t.co/192140rjb0', 'expanded_... |
| 3 | 2020-11-23 17:38:28+00:00 | 2020-11-19 15:45:35+00:00 | [] |
| 4 | 2020-11-23 16:43:07+00:00 | 2020-11-17 07:21:19+00:00 | [{'url': 'https://t.co/0Kb7fMSRGG', 'expanded_... |
| ... | ... | ... | ... |
| 154320 | 2021-05-18 11:01:44+00:00 | 2021-02-12 13:40:33+00:00 | [{'url': 'https://t.co/Znpah1MkTS', 'expanded_... |
| 154321 | 2021-05-18 11:01:56+00:00 | 2021-01-17 11:24:36+00:00 | [] |
| 154322 | 2021-05-18 11:02:37+00:00 | 2021-03-30 16:23:15+00:00 | [{'url': 'https://t.co/OYfC0uCD3d', 'expanded_... |
| 154323 | 2021-05-18 11:04:15+00:00 | 2021-03-05 16:14:26+00:00 | [{'url': 'https://t.co/uOuh8tz2nz', 'expanded_... |
| 154324 | 2021-05-18 11:04:57+00:00 | 2020-12-16 07:54:15+00:00 | [] |
154325 rows × 3 columns
# Map every url domain to the number of times it appears across all tweets.
domain_hits = []
for raw_json in df['urls']:
    for display in gen_ut.get_string_json(raw_json, 'display_url'):
        if display:
            # assumes display_url style "host/path" (no scheme) -- take the
            # text before any "//", then the text before the first "/".
            host = display.split("//")[0]
            domain_hits.append(host.split("/")[0])
dfUrls = pd.DataFrame({'url': domain_hits})
dfUrls['count'] = 1
dfUrls = dfUrls.groupby('url').sum()
dfUrls.sort_values('count', ascending=False, inplace=True)
dfUrls
| count | |
|---|---|
| url | |
| twitter.com | 14525 |
| imolaoggi.it | 839 |
| dlvr.it | 470 |
| tgcom24.mediaset.it | 466 |
| ansa.it | 451 |
| ... | ... |
| camacoes.it | 1 |
| ilperiodicoblog.it | 1 |
| nuovosud.it | 1 |
| calabrianews.it | 1 |
| fratelli-italia.it | 1 |
2155 rows × 1 columns
# Horizontal bar chart of the 20 most-used url domains.
n = 20
top_urls = dfUrls.head(n)
fig = px.histogram(top_urls, y=top_urls.index, x='count',
                   orientation='h',
                   title="The most %d url used in the tweets"% n)
fig.update_yaxes(title='URL name')
fig.show()
# Build a per-(week, url) usage table. Only the FIRST url of each tweet is
# counted here (this matches the original cell's behaviour, unlike the
# all-urls count above).
my_dict = {"url": [], "date": [], "count": []}
# Iterate the two columns in lockstep instead of per-row df.loc[i, ...]
# scalar lookups, which are slow and an indexing anti-pattern.
for raw_json, created in zip(df['urls'], df['created_at']):
    displays = gen_ut.get_string_json(raw_json, 'display_url')
    if displays:
        # Same domain extraction as the counting cell: strip any "//" part
        # and any path component.
        domain = displays[0].split("//")[0].split("/")[0]
        my_dict["url"].append(domain)
        my_dict["date"].append(created)
        my_dict["count"].append(1)
dfUseUrl = pd.DataFrame.from_dict(my_dict)
# Label each row with its ISO "week-year", then sum uses per (week, url).
dfUseUrl['Week/Year'] = dfUseUrl['date'].apply(
    lambda x: "%d-%d" % (x.isocalendar()[1], x.isocalendar()[0]))
dfUseUrl.drop(['date'], axis=1, inplace=True)
dfUseUrl = dfUseUrl.groupby(['Week/Year', 'url']).sum()
dfUseUrl.reset_index(inplace=True)
# NOTE(review): the label stores an ISO week number but is parsed back with
# "%W" (a different week-numbering scheme); around year boundaries the two
# can disagree by one week -- confirm before trusting week-edge buckets.
dfUseUrl['Week/Year'] = pd.to_datetime(dfUseUrl['Week/Year'] + '-1', format="%W-%Y-%w")
dfUseUrl.sort_values(['Week/Year'], axis=0, inplace=True, ascending=True)
dfUseUrl
| Week/Year | url | count | |
|---|---|---|---|
| 4596 | 2020-11-23 | agi.it | 1 |
| 4597 | 2020-11-23 | fanpa.ge | 1 |
| 4598 | 2020-11-23 | huffp.st | 1 |
| 4599 | 2020-11-23 | la7.it | 1 |
| 4600 | 2020-11-23 | numero6.org | 1 |
| ... | ... | ... | ... |
| 4060 | 2021-05-17 | gpdp.it | 1 |
| 4061 | 2021-05-17 | grandeinganno.it | 8 |
| 4062 | 2021-05-17 | huffingtonpost.it | 2 |
| 4064 | 2021-05-17 | ift.tt | 1 |
| 4090 | 2021-05-17 | japantimes.co.jp | 1 |
6045 rows × 3 columns
# Single figure: one weekly-usage trace per top-5 url domain.
fig = go.Figure()
for site in dfUrls.head().index:
    site_rows = dfUseUrl[dfUseUrl['url'] == site]
    fig.add_trace(go.Scatter(x=site_rows['Week/Year'],
                             y=site_rows['count'],
                             mode='lines+markers',
                             name=site))
fig.update_layout(title='All url history use', xaxis_title='Date', yaxis_title='use count')
fig.show()
# One standalone figure per top-5 url domain.
for site in dfUrls.head().index:
    site_rows = dfUseUrl[dfUseUrl['url'] == site]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=site_rows['Week/Year'],
                             y=site_rows['count'],
                             mode='lines+markers',
                             name=site))
    fig.update_layout(title="History use of url '%s'"%site,
                      xaxis_title='Date', yaxis_title='use count')
    fig.show()
# Hashtag study: reload the suspect tweets with their hashtag payload.
df = df_preparation(['hashtags', 'created_at'])
df['created_at'] = pd.to_datetime(df['created_at'], format="%a %b %d %X %z %Y")
# Creating a map of all hashtags with the number of uses.
listHashtags = []
for s in df['hashtags']:
    # extend() replaces a list comprehension that was used purely for its
    # append side effect (the comprehension's result was thrown away).
    listHashtags.extend(gen_ut.get_string_json(s, 'text'))
dfHashtags = pd.DataFrame()
dfHashtags['hashtags'] = listHashtags
dfHashtags['count'] = 0
# groupby().count() counts rows per hashtag (the 0 placeholder is just a
# column to count over).
dfHashtags = dfHashtags.groupby('hashtags').count()
dfHashtags.sort_values(['count'], axis=0, inplace=True, ascending=False)
dfHashtags
| count | |
|---|---|
| hashtags | |
| vaccino | 4517 |
| vaccini | 4122 |
| COVID19 | 3991 |
| AstraZeneca | 3093 |
| VaccinoAntiCovid | 1312 |
| ... | ... |
| benedettasaccenza | 1 |
| belluno | 1 |
| Depuydt | 1 |
| bellanotizia | 1 |
| 01Marzo | 1 |
9692 rows × 1 columns
# Re-import the helper so edits made to hashtag_util during the notebook
# session take effect without restarting the kernel.
importlib.reload(ut_ht)
# dfUse: presumably a per-(week, hashtag) usage table, analogous to
# dfUseUrl above (see the printed columns Week/Year, hashtag, count).
dfUse = ut_ht.process_dfUse(df)
dfUse
| Week/Year | hashtag | count | |
|---|---|---|---|
| 19335 | 2020-11-23 | Arcuri | 1 |
| 19355 | 2020-11-23 | vaccino | 7 |
| 19354 | 2020-11-23 | vaccini | 1 |
| 19353 | 2020-11-23 | scienza | 1 |
| 19352 | 2020-11-23 | propagandalive | 1 |
| ... | ... | ... | ... |
| 4758 | 2021-05-17 | vaccino | 85 |
| 4757 | 2021-05-17 | vacciniamoci | 1 |
| 4756 | 2021-05-17 | vacciniMagnetici | 4 |
| 4754 | 2021-05-17 | vaccine | 6 |
| 2126 | 2021-05-17 | migranti | 1 |
21174 rows × 3 columns
# Histogram of the raw (unfiltered) hashtag counts.
ut_ht.visual_histogram(dfHashtags,1000,500,100)
# First pass: drop hashtags that trivially dominate the corpus
# (vaccine / covid / manufacturer terms), matched case-insensitively.
hastagRemove = ['vaccin.*','covid.*','corona.*','astrazeneca','pfizer','sarscov2','sputnikv','moderna']
# .copy() so the inplace drops below do not silently mutate dfHashtags;
# the original plain assignment aliased the same frame.
dfHashtagFiltered = dfHashtags.copy()
for r in hastagRemove:
    # "== True" coerces any non-boolean match results to a clean boolean mask.
    mask = dfHashtagFiltered.index.str.lower().str.match(r) == True
    dfHashtagFiltered.drop(dfHashtagFiltered[mask].index, inplace=True)
# Second pass: also remove political / public-figure hashtags.
dfMoreFiltered = dfHashtagFiltered.copy()
hastagRemove = ['.*lombardia.*','draghi','conte','m5s','mattarella','salvini','speranza','renzi','lega','.*governo.*',
                '.*moratti.*','zingaretti','scanzi','burioni','crisanti']
for r in hastagRemove:
    mask = dfMoreFiltered.index.str.lower().str.match(r) == True
    dfMoreFiltered.drop(dfMoreFiltered[mask].index, inplace=True)
ut_ht.visual_histogram(dfMoreFiltered,150,70,30)
ut_ht.visual_by_date_together(dfMoreFiltered,dfUse)
ut_ht.visual_by_date_split(dfMoreFiltered,dfUse)
# Per-author totals: how many of each user's posts are self retweets.
df = df_preparation(['is_self_rt', 'user_screen_name'])
by_user = df.groupby('user_screen_name')
# numeric_only=True: summing the tz-aware user_created_at column is an
# error in pandas >= 2 (nuisance columns are no longer silently dropped);
# only the boolean is_self_rt flag should be summed.
dfSelf = by_user.sum(numeric_only=True)
# Row count per author, taken from the first counted column.
dfSelf['all_rt'] = by_user.count().iloc[:, 0]
dfSelf.sort_values(['all_rt'], inplace=True)
dfSelf
| is_self_rt | all_rt | |
|---|---|---|
| user_screen_name | ||
| 00__Enigma__00 | 0 | 1 |
| SojournerQuest1 | 0 | 1 |
| Sol90407448 | 0 | 1 |
| SolariDaniele | 0 | 1 |
| Soldat2laverite | 0 | 1 |
| ... | ... | ... |
| manuel_y_jesus_ | 11 | 1105 |
| Anna302478978 | 0 | 1106 |
| Lorenzxo2 | 2 | 1187 |
| IoaproNaty | 0 | 1614 |
| ZombieBuster5 | 0 | 1737 |
20175 rows × 2 columns
# Stacked view of total vs self retweets for the 20 busiest retweeters.
n = 20
top_self = dfSelf.tail(n)
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Bar(y=top_self.index,
                     x=top_self['all_rt'] - top_self['is_self_rt'],
                     orientation='h', name='All retweet'),
              row=1, col=1)
fig.add_trace(go.Bar(y=top_self.index,
                     x=top_self['is_self_rt'],
                     orientation='h', name='Self retweet'),
              row=1, col=1)
fig.update_layout(title="How many retweet are self retweet (the most 20 retweeter)")
fig.update_xaxes(title="Count of retweets")
fig.update_yaxes(title="Username")
fig.update_layout(barmode="overlay", bargap=0.1)
fig.show()
# Reload with the interaction columns (reply target, retweet source and
# the self-retweet flag) for the who-interacts-with-whom analyses below.
df = df_preparation(['is_self_rt','user_screen_name','rt_user_screen_name','in_reply_to_screen_name'])
df
| user_screen_name | user_created_at | in_reply_to_screen_name | rt_user_screen_name | is_self_rt | |
|---|---|---|---|---|---|
| 0 | MenaViola2 | 2020-11-22 18:44:29+00:00 | NaN | NaN | False |
| 1 | Andrea75987735 | 2020-11-17 08:58:10+00:00 | GioVanDal | NaN | False |
| 2 | McOzzy6 | 2020-11-21 19:29:54+00:00 | borghi_claudio | NaN | False |
| 3 | ReSantolao | 2020-11-19 15:45:35+00:00 | NaN | gustinicchi | False |
| 4 | 75spiritolibero | 2020-11-17 07:21:19+00:00 | NaN | NaN | False |
| ... | ... | ... | ... | ... | ... |
| 154320 | ExmoorOn | 2021-02-12 13:40:33+00:00 | NaN | Nibiru_xxx | False |
| 154321 | Dado46544020 | 2021-01-17 11:24:36+00:00 | NaN | robersperanza | False |
| 154322 | BernardMalcom2 | 2021-03-30 16:23:15+00:00 | NaN | Corriere | False |
| 154323 | Pasquins6 | 2021-03-05 16:14:26+00:00 | NaN | NaN | False |
| 154324 | RagDiabolik | 2020-12-16 07:54:15+00:00 | Cartabellotta | NaN | False |
154325 rows × 5 columns
# Per retweeted account: total retweets received, self retweets, and
# "real" retweets coming from other accounts.
retweets = df.dropna(subset=['rt_user_screen_name'])
grouped = retweets.groupby('rt_user_screen_name')
# count() fills every remaining column with the per-group row count.
dfRetweet = grouped.count()
dfRetweet.drop(columns=['in_reply_to_screen_name'], inplace=True, errors='ignore')
dfRetweet.rename(columns={'user_screen_name': 'all_rt'}, inplace=True, errors='ignore')
# Sum only the boolean flag: a frame-wide grouped.sum() would hit the
# tz-aware user_created_at column (an error in pandas >= 2), and the
# original recomputed the whole dropna+groupby just for this one column.
dfRetweet['self_rt'] = grouped['is_self_rt'].sum()
dfRetweet.drop(columns=['is_self_rt'], inplace=True, errors='ignore')
dfRetweet['real_rt'] = dfRetweet['all_rt'] - dfRetweet['self_rt']
dfRetweet.sort_values('real_rt', ascending=False, inplace=True)
dfRetweet
| all_rt | user_created_at | self_rt | real_rt | |
|---|---|---|---|---|
| rt_user_screen_name | ||||
| RobertoBurioni | 1864 | 1864 | 0 | 1864 |
| BarbaraRaval | 1291 | 1291 | 0 | 1291 |
| noitre32 | 1195 | 1195 | 0 | 1195 |
| ImolaOggi | 1097 | 1097 | 0 | 1097 |
| MinervaMcGrani1 | 883 | 883 | 0 | 883 |
| ... | ... | ... | ... | ... |
| EddieSgarbossa | 1 | 1 | 1 | 0 |
| Jorjii0 | 1 | 1 | 1 | 0 |
| Shubham36694193 | 1 | 1 | 1 | 0 |
| MINNIE545101562 | 1 | 1 | 1 | 0 |
| zumpapa29 | 1 | 1 | 1 | 0 |
9020 rows × 4 columns
# Top-20 accounts by retweets received from others.
n = 20
top_rt = dfRetweet.head(n)
fig = px.histogram(top_rt, y=top_rt.index, x='real_rt', orientation='h')
fig.update_yaxes(title='username')
fig.update_layout(title="The most %d users retweeted (without self retweet)"%n)
fig.show()
# Per replied-to account: number of replies received.
replies = df.dropna(subset=['in_reply_to_screen_name']).copy()
# The retweet columns are irrelevant for the reply count.
replies.drop(columns=['rt_user_screen_name'], inplace=True, errors='ignore')
replies.drop(columns=['is_self_rt'], inplace=True, errors='ignore')
dfReply = replies.groupby('in_reply_to_screen_name').count()
dfReply.rename(columns={'user_screen_name': 'count'}, inplace=True, errors='ignore')
dfReply.sort_values('count', ascending=False, inplace=True)
dfReply
| count | user_created_at | |
|---|---|---|
| in_reply_to_screen_name | ||
| matteosalvinimi | 618 | 618 |
| borghi_claudio | 342 | 342 |
| Adnkronos | 299 | 299 |
| MediasetTgcom24 | 280 | 280 |
| repubblica | 277 | 277 |
| ... | ... | ... |
| Patou571 | 1 | 1 |
| PatoOsko | 1 | 1 |
| PastorellaGiu | 1 | 1 |
| Pasqual01029538 | 1 | 1 |
| zziocane66 | 1 | 1 |
10226 rows × 2 columns
# Top-20 accounts by replies received.
n = 20
top_reply = dfReply.head(n)
fig = px.histogram(top_reply, y=top_reply.index, x='count', orientation='h')
fig.update_yaxes(title='username')
fig.update_layout(title="The most %d users replied"%n)
fig.show()
# Per-author activity profile: posts made, replies/retweets made, plus how
# often the author is replied to / retweeted by others (from dfReply and
# dfRetweet built above).
df = df_preparation(['is_self_rt','user_screen_name','in_reply_to_screen_name','rt_user_screen_name'])
by_user = df.groupby('user_screen_name')
# count() = non-null entries per column; size() = total rows per author
# (the original built a throwaway full copy df1 with a literal-1 column
# just to compute this).
df = by_user.count()
df['sum_total_posts'] = by_user.size()
# Sum just the flag column: a frame-wide sum() would hit the tz-aware
# user_created_at column, an error in pandas >= 2.
df['sum_self_rt'] = by_user['is_self_rt'].sum()
df.rename(columns={'in_reply_to_screen_name':'num_reply','rt_user_screen_name':'num_rt'},inplace=True,errors='ignore')
# Align received-interaction totals on username; authors that never appear
# in dfReply / dfRetweet get 0 instead of NaN (fillna replaces the
# original's manual .loc[isna] masking).
df['num_in_reply'] = dfReply['count']
df['num_in_reply'] = df['num_in_reply'].fillna(0)
df['num_in_rt'] = dfRetweet['real_rt']
df['num_in_rt'] = df['num_in_rt'].fillna(0)
df
| user_created_at | num_reply | num_rt | is_self_rt | sum_total_posts | sum_self_rt | num_in_reply | num_in_rt | |
|---|---|---|---|---|---|---|---|---|
| user_screen_name | ||||||||
| 00__Enigma__00 | 1 | 1 | 0 | 1 | 1 | 0 | 0.0 | 0.0 |
| 00__Ribelle__00 | 6 | 4 | 0 | 6 | 6 | 0 | 0.0 | 1.0 |
| 012Cla | 5 | 0 | 5 | 5 | 5 | 0 | 0.0 | 0.0 |
| 01_entro_py | 3 | 2 | 0 | 3 | 3 | 0 | 0.0 | 0.0 |
| 02_cocco | 1 | 0 | 1 | 1 | 1 | 0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| zpollastra | 1 | 0 | 1 | 1 | 1 | 0 | 0.0 | 0.0 |
| zukkeroepepe | 2 | 0 | 1 | 2 | 2 | 0 | 0.0 | 0.0 |
| zumpapa29 | 7 | 3 | 3 | 7 | 7 | 1 | 0.0 | 0.0 |
| zvnipat | 3 | 3 | 0 | 3 | 3 | 0 | 0.0 | 0.0 |
| zzhang12101 | 3 | 0 | 2 | 3 | 3 | 0 | 0.0 | 0.0 |
20175 rows × 8 columns
df.describe()
| user_created_at | num_reply | num_rt | is_self_rt | sum_total_posts | sum_self_rt | num_in_reply | num_in_rt | |
|---|---|---|---|---|---|---|---|---|
| count | 20175.000000 | 20175.000000 | 20175.000000 | 20175.000000 | 20175.000000 | 20175.000000 | 20175.000000 | 20175.000000 |
| mean | 7.649318 | 1.734721 | 4.175316 | 7.649318 | 7.649318 | 0.047831 | 0.162429 | 0.303048 |
| std | 42.440474 | 9.319139 | 31.657150 | 42.440474 | 42.440474 | 1.584675 | 2.121597 | 7.912871 |
| min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 3.000000 | 1.000000 | 1.000000 | 3.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 1737.000000 | 507.000000 | 1303.000000 | 1737.000000 | 1737.000000 | 165.000000 | 181.000000 | 661.000000 |
# Keep only the ten accounts most retweeted by others.
df = df.sort_values('num_in_rt', ascending=False).head(10)
df
| user_created_at | num_reply | num_rt | is_self_rt | sum_total_posts | sum_self_rt | num_in_reply | num_in_rt | |
|---|---|---|---|---|---|---|---|---|
| user_screen_name | ||||||||
| liliaragnar | 848 | 75 | 362 | 848 | 848 | 0 | 139.0 | 661.0 |
| ZombieBuster5 | 1737 | 158 | 1303 | 1737 | 1737 | 0 | 36.0 | 530.0 |
| sabrina__sf | 246 | 10 | 20 | 246 | 246 | 0 | 181.0 | 475.0 |
| viaggrego | 372 | 2 | 175 | 372 | 372 | 0 | 2.0 | 200.0 |
| rej_panta | 324 | 51 | 62 | 324 | 324 | 5 | 58.0 | 195.0 |
| IntoPandemic | 45 | 0 | 7 | 45 | 45 | 0 | 1.0 | 187.0 |
| AmbrosinoSalva3 | 422 | 139 | 5 | 422 | 422 | 0 | 36.0 | 186.0 |
| ResPubl79983835 | 664 | 18 | 450 | 664 | 664 | 104 | 19.0 | 158.0 |
| paradoxMKD | 358 | 70 | 207 | 358 | 358 | 0 | 9.0 | 139.0 |
| Valenti63339244 | 143 | 7 | 1 | 143 | 143 | 0 | 14.0 | 123.0 |
# Compare how many tweets in each stance corpus come from "suspect"
# (recently created) accounts.
# NOTE(review): these are tweet counts, not distinct accounts -- verify
# the wording "novax that have been created" against the intent.
df = df_preparation(['user_screen_name'])
dfNovax = df_preparation(['user_screen_name'], filename='../tweets_novax.csv')
dfProvax = df_preparation(['user_screen_name'], filename='../tweets_provax.csv')
novax_share = (len(dfNovax) / len(df)) * 100
print("There are %d (%d%%) novax that have been created after the %s"%
      (len(dfNovax), novax_share, first_case))
There are 61564 (39%) novax that have been created after the 2020-11-17
# Same share computation for the provax corpus.
provax_share = (len(dfProvax) / len(df)) * 100
print("There are %d (%d%%) provax that have been created after the %s"%
      (len(dfProvax), provax_share, first_case))
There are 14529 (9%) provax that have been created after the 2020-11-17